#' ---
#' author: "Pablo Diego-Rosell, PhD"
#' email: "pablo_diego_rosell@yahoo.com"
#' output:
#'    html_document:
#'      toc: true
#'      theme: united
#' ---

# Start from a clean workspace before sourcing the helper functions.
# `all.names` is spelled out and set to TRUE so that objects whose names begin
# with a dot are cleared too (the previous `all=t` passed the base function
# `t`, not a logical).
rm(list = ls(all.names = TRUE))

#' #Setup filenames

filename <- "India_Public Use" # !!!Update filename
functions_vers <- "functions_1.7.R" # !!!Update helper functions file

#' #Setup data, functions and create dictionary for dataset review
source(functions_vers)
#'
#' Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags: 
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Small Location: Location (<100,000) 
# Large Location (>100,000)
# Weight: weightVar
# Household ID:  hhId, 
# Open-ends: Review responses for any sensitive information, redact as necessary 

#!!!Save flagged dictionary in .csv format, add "DatasetReview" to name and continue processing data with subset of flagged variables

#' #Direct PII: variables to be removed
# !!!No Direct PII

#' #Direct PII-team: Encode field team names
#' !!!No Direct PII-team

#' #Small locations: Encode locations  with pop <100,000 using random large numbers
#'  !!!Include relevant variables, but check their population size first to confirm they are <100,000

# Location variables handed to encode_location(); 999999 is passed as the
# missing-value code.
locvars <- c(
  "b_Panchayat",
  "e_Panchayat"
)
mydata <- encode_location(
  variables = locvars,
  missing = 999999
)

#' #Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values
# Focus on variables with a "Lowest Freq" in dictionary of 30 or less. 

# Top code income variables
#
# Each entry maps a variable to the break point passed to top_recode().
# A fixed number is used as-is; NA means "use the value returned by
# percentile_checker() for that variable". percentile_checker() is still
# called for every variable, in the listed order, so its result is always
# available in `percentile_99.5` for review.
# NOTE(review): there is no entry for b_earnemonth although e_earnemonth is
# listed - confirm whether that omission is intentional.
income_expense_caps <- c(
  earnmonth_tot_HH   = 40000,
  b_earnamonth       = NA,
  b_earnbmonth       = 5000,
  b_earncmonth       = 9000,
  b_earndmonth       = NA,
  b_earnfmonth       = 7000,
  b_earngmonth       = NA,
  b_earnhmonth       = 10000,
  b_earnimonth       = 8000,
  b_earnjmonth       = 180000,
  b_rentexpnew       = 4000,
  b_foodexpnew       = NA,
  b_clothesexpnew    = NA,
  b_travelexpnew     = NA,
  b_healthexpnew     = NA,
  b_schoolexpnew     = 12000,
  b_utilityexpnew    = NA,
  b_phoneexpnew      = NA,
  b_alcoholexpnew    = NA,
  b_recrexpnew       = NA,
  b_debtexpnew       = NA,
  b_otherexpnew      = NA,
  b_landinc          = NA,
  e_earnamonth       = 2000,
  e_earnbmonth       = 1000,
  e_earncmonth       = 6000,
  e_earndmonth       = 10000,
  e_earnemonth       = 2000,
  e_earnfmonth       = 4000,
  e_earngmonth       = 500,
  e_earnhmonth       = 5000,
  e_earnimonth       = 5000,
  e_earnjmonth       = 10000,
  e_rentexpnew       = 1000,
  e_foodexpnew       = NA,
  e_clothesexpnew    = NA,
  e_travelexpnew     = NA,
  e_healthexpnew     = 7500,
  e_schoolexpnew     = NA,
  e_utilityexpnew    = 400,
  e_phoneexpnew      = NA,
  e_alcoholexpnew    = NA,
  e_recrexpnew       = 1200,
  e_debtexpnew       = 9000,
  e_otherexpnew      = 500,
  e_landinc          = NA,
  HHincome_percapita = 5000
)

# Variables whose missing-value codes must be passed to both helpers.
# All other variables call percentile_checker() without a `missing` argument
# and top_recode() with `missing = NA`.
special_missing <- list(b_landinc = c(66666, 88888))

for (var_name in names(income_expense_caps)) {
  miss_codes <- special_missing[[var_name]]
  if (is.null(miss_codes)) {
    percentile_99.5 <- percentile_checker(var_name)
    miss_codes <- NA
  } else {
    percentile_99.5 <- percentile_checker(var_name, missing = miss_codes)
  }

  cap <- income_expense_caps[[var_name]]
  if (is.na(cap)) {
    cap <- percentile_99.5
  }

  mydata <- top_recode(variable = var_name, break_point = cap, missing = miss_codes)
}

# Top code household size and asset/livestock counts at fixed break points.
mydata <- top_recode(variable = "HHsize", break_point = 20, missing = NA)
mydata <- top_recode(variable = "b_numfamily", break_point = 20, missing = NA)

#percentile_checker("b_advamt", missing=88888)
mydata <- top_recode(variable = "b_advamt", break_point = 10000, missing = 88888)

#percentile_checker("e_cellpone")
mydata <- top_recode(variable = "e_cellpone", break_point = 10, missing = NA)

mydata <- top_recode(variable = "e_computer", break_point = 2, missing = NA)
mydata <- top_recode(variable = "e_bicycle", break_point = 2, missing = NA)
mydata <- top_recode(variable = "e_motorcycle", break_point = 2, missing = NA)
mydata <- top_recode(variable = "e_car", break_point = 1, missing = NA)
mydata <- top_recode(variable = "e_tv", break_point = 2, missing = NA)
mydata <- top_recode(variable = "e_cow", break_point = 50, missing = NA)
mydata <- top_recode(variable = "e_bullocks", break_point = 10, missing = NA)
mydata <- top_recode(variable = "e_buffalo", break_point = 10, missing = NA)
# NOTE(review): the e_sheep call appeared twice in a row with identical
# arguments; the repeat has been removed. If the second call was meant for a
# different livestock variable, add it here.
mydata <- top_recode(variable = "e_sheep", break_point = 10, missing = NA)
mydata <- top_recode(variable = "e_chicken", break_point = 20, missing = NA)

# Drop identifiers for small religious minorities

dropvars <- c(
  "b_rel3",
  "e_rel3",
  "e_rel4"
)
# Keep every column whose name is not listed in dropvars.
keep_cols <- !(names(mydata) %in% dropvars)
mydata <- mydata[keep_cols]

#' #Indirect PII - Categorical: Recode, encode, or Top/bottom coding for extreme values

# Religion: break points and value labels passed to ordinal_recode().
break_rel <- c(1, 2, 3, 8)
labels_rel <- c(
  "Hinduism" = 1,
  "Islam" = 2,
  "Other" = 3,
  "Unsure" = 4
)
mydata <- ordinal_recode(
  variable = "rel",
  break_points = break_rel,
  missing = 999999,
  value_labels = labels_rel
)

# Marital status: the same break points and labels are applied to both the
# b_ and e_ versions of the variable.
break_mar <- c(1, 2, 3, 4)
labels_mar <- c(
  "Never Married" = 1,
  "Currently married" = 2,
  "Currently married but no guana" = 3,
  "Widowed/Divorced/Separated" = 4
)
mydata <- ordinal_recode(
  variable = "b_marstat",
  break_points = break_mar,
  missing = 999999,
  value_labels = labels_mar
)
# The result must be assigned back to `mydata` (it was previously stored in an
# unused `mydata2`), otherwise the e_marstat recode never reaches the saved
# public-use files.
mydata <- ordinal_recode(
  variable = "e_marstat",
  break_points = break_mar,
  missing = 999999,
  value_labels = labels_mar
)

# !!!Include relevant variables in list below (Indirect PII - Categorical, and Ordinal if not processed yet)

# Numbered variable families are generated with paste0(); the resulting
# character vector lists the same names in the same order as writing them out.
indirect_PII <- c(
  paste0("b_Interviewer_Q4_", 2:8),
  "e_nenroll_masked",
  "e_seas_2",
  "e_natl_masked",
  "e_disab",
  "e_impairment",
  paste0("e_impairment_", 1:8)
)

capture_tables(indirect_PII)

# Recode those with very specific values.

# The first membership flag in each family is kept and given a generic label;
# the remaining flags in each family are dropped below.
legis_mem1_label <- "Member of gram panchayat, other local, state or national legislative body or school management committee"
var_label(mydata$b_legis_mem1) <- legis_mem1_label
var_label(mydata$b_W2_legis1) <- legis_mem1_label
var_label(mydata$e_legis_mem1) <- legis_mem1_label
# Label e_W2_legis1, the retained flag. The label used to be set on
# e_W2_legis2, which is removed by the drop list below, leaving e_W2_legis1
# unlabelled.
var_label(mydata$e_W2_legis1) <- legis_mem1_label

dropvars <- c("b_legis_mem2",
              "b_legis_mem3",
              "b_legis_mem4",
              "b_legis_mem5",
              "b_legis_mem6",
              "b_W2_legis2",
              "b_W2_legis3",
              "b_W2_legis4",
              "b_W2_legis5",
              "e_legis_mem2",
              "e_legis_mem3",
              "e_legis_mem4",
              "e_legis_mem5",
              "e_legis_mem6",
              "e_W2_legis2",
              "e_W2_legis3",
              "e_W2_legis4",
              "e_W2_legis5")
mydata <- mydata[!names(mydata) %in% dropvars] # Drop specific flags for type of government position held as strong identifier

# Drop specific disabilities as strong identifier
# NOTE(review): e_impairment_8 and b_Interviewer_Q4_8 are tabulated in
# indirect_PII but are not part of this drop list - confirm that keeping them
# is intentional.
dropvars <- c(
  "e_impairment",
  paste0("e_impairment_", 1:7),
  paste0("b_Interviewer_Q4_", 2:7)
)
keep_cols <- !(names(mydata) %in% dropvars)
mydata <- mydata[keep_cols]

#' #Matching and crosstabulations: Run automated PII check 

# Based on dictionary inspection, select variables for creating sdcMicro object
# See: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# All variable names should correspond to the names in the data file

# Categorical key variables: gender, occupation/education and age
selectedKeyVars <- c("gender", "age_masked", "edu_control") ## !!! Replace with candidate categorical demo vars

# Weight variable (add if available)
# selectedWeightVar = c('projwt') ##!!! Replace with weight var

# Household id variable (cluster)
selectedHouseholdID <- "hhid" ## !!! Replace with household id

# Build the sdcMicro object from the selected variables, then print it so its
# summary appears in the report.
sdcInitial <- createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars,
  hhId = selectedHouseholdID
)
sdcInitial # No records violate 2-anonymity

#' #Open-ends: review responses for any sensitive information, redact as necessary
#' 
# !!! Identify open-end variables here:
# The five b_Fr<N>_topic variables are generated with paste0(); the vector
# holds the same names in the same order as listing them individually.
open_ends <- c(
  "b_migpurp",
  "b_BBA_specify_masked",
  paste0("b_Fr", 1:5, "_topic"),
  "b_W2_rnd_work_sch_why",
  "b_Interviewer_Q2_masked"
)

report_open(list_open_ends = open_ends)

# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number

# Rows that receive the same redacted text are grouped into one indexed
# assignment per replacement string. Rows are addressed by position, so these
# numbers are only valid for the row order of the dataset as loaded.

mydata$b_BBA_specify_masked[c(
  35727, 35729, 35731, 35732, 35736, 35737,
  35738, 35739, 35740, 35742, 35743
)] <- "In Panchayat bhawan and [location name redacted]."

mydata$b_BBA_specify_masked[c(
  19084, 19086, 19088, 19091, 19092, 19093, 19097
)] <- "I've seen it written on the wall in [location name redacted]"

mydata$b_BBA_specify_masked[c(
  15372, 15373, 15375, 15380, 15381, 15385
)] <- "Have seen a procession towards [location name redacted]"

mydata$b_migpurp[13630] <- "Went to [location redacted] to stay"
mydata$b_migpurp[c(25055, 25511, 25518)] <- "[Individual's name redacted]"
mydata$b_migpurp[35158] <- "[Individual's name redacted] has gone for education"

mydata$b_Interviewer_Q2_masked[c(
  19178:19183, 19188, 19192
)] <- "[Location name redacted]"

#' #GPS data: Displace
# !!! No GPS data

#' #Save processed data in Stata and SPSS format
#' Adds "_PU" (Public Use) to the end of the name 

# Write the processed data next to the script, once as Stata (.dta) and once
# as SPSS (.sav), using the input file name plus the "_PU" suffix.
haven::write_dta(
  data = mydata,
  path = paste0(filename, "_PU.dta")
)
haven::write_sav(
  data = mydata,
  path = paste0(filename, "_PU.sav")
)

# Add report title dynamically
title_var <- paste0("DOL-ILAB SDC - ", filename)
#'---
#'  title: `r title_var`
#'---
